import pandas as pd
import jieba
from config import wconfig


def read_lines(filename, is_zh=False):
    """Read a text file and return its sentences as a flat list.

    Every line of the file is split into sentences by ``get_list`` (on '。'
    when ``is_zh`` is true, otherwise on '.'). Each fragment is stripped of
    surrounding whitespace and empty fragments are discarded, so blank lines
    and the newline left over after a line's final full stop never show up
    as sentences.

    Args:
        filename: Path of the text file to read. It is decoded as UTF-8.
        is_zh: True when the file holds Chinese text.

    Returns:
        A list of non-empty sentence strings, in file order.
    """
    # Decode explicitly: the locale default is not UTF-8 on every platform,
    # which garbles or rejects Chinese text. Assumes the corpus is UTF-8.
    with open(filename, 'r', encoding='utf-8') as f:
        fragments = (
            piece.strip()
            for line in f
            for piece in get_list(line, is_zh)
        )
        # Materialize while the file is still open; keep real sentences only.
        return [piece for piece in fragments if piece]


def get_list(line, is_zh):
    """Split ``line`` into pieces at every full stop.

    The Chinese full stop '。' is the delimiter when ``is_zh`` is truthy,
    otherwise the ASCII period '.'. Delimiters are removed and the pieces
    are returned untouched, so empty strings and whitespace may appear.
    """
    if is_zh:
        delimiter = '。'
    else:
        delimiter = '.'
    return line.split(delimiter)


def split_sentence(lines_list, out_filename, is_zh=False):
    """Tokenize each sentence and write the result to a CSV file.

    Chinese sentences are segmented with ``jieba.cut``; other sentences are
    split on whitespace. Tokens are joined with '/'. The CSV has the columns
    ``order`` (0-based position of the sentence), ``sentence`` (the original
    text) and ``split_sentence`` (the '/'-joined tokens).

    Args:
        lines_list: Iterable of sentence strings.
        out_filename: Path of the CSV file to write.
        is_zh: True to segment with jieba, False to split on whitespace.
    """
    if is_zh:
        rows = [[line, '/'.join(jieba.cut(line))] for line in lines_list]
    else:
        rows = [[line, '/'.join(line.split())] for line in lines_list]

    # Name the columns at construction time instead of aliasing them in
    # to_csv: with no rows the frame would otherwise have zero columns and
    # to_csv would raise on the two-name header. Empty input now yields a
    # header-only file.
    data = pd.DataFrame(rows, columns=['sentence', 'split_sentence'])
    data.to_csv(out_filename, index_label="order")


if __name__ == '__main__':
    # Resolve every configured path up front so a missing setting fails
    # before any file is read or written.
    source_zh, source_en = wconfig.VOCAB_ZH_TEXT_PATH, wconfig.VOCAB_EN_TEXT_PATH
    target_zh, target_en = wconfig.SENTENCE_ZH_TEXT_PATH, wconfig.SENTENCE_EN_TEXT_PATH

    # One job per language: (input text, output CSV, is-Chinese flag).
    # Chinese is processed first, then English.
    jobs = (
        (source_zh, target_zh, True),
        (source_en, target_en, False),
    )
    for src_path, dst_path, chinese in jobs:
        sentences = read_lines(src_path, is_zh=chinese)
        split_sentence(sentences, dst_path, is_zh=chinese)